{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fit Hu et al. matrix factorization to the binarized taste profile dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ['OMP_NUM_THREADS'] = '1'   # to not conflict with joblib\n",
    "\n",
    "import numpy as np\n",
    "import scipy.sparse\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import content_wmf\n",
    "import batched_inv_joblib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "unique_uid = list()\n",
    "with open('unique_uid.txt', 'r') as f:\n",
    "    for line in f:\n",
    "        unique_uid.append(line.strip())\n",
    "    \n",
    "unique_sid = list()\n",
    "with open('unique_sid.txt', 'r') as f:\n",
    "    for line in f:\n",
    "        unique_sid.append(line.strip())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "613682 97414\n"
     ]
    }
   ],
   "source": [
    "n_songs = len(unique_sid)\n",
    "n_users = len(unique_uid)\n",
    "\n",
    "print n_users, n_songs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "92543\n"
     ]
    }
   ],
   "source": [
    "# the last 5% of the songs are in the out-matrix set\n",
    "n_songs = int(0.95 * n_songs)\n",
    "print n_songs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load the data and train the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def load_data(csv_file, shape=(n_users, n_songs)):\n",
    "    tp = pd.read_csv(csv_file)\n",
    "    rows, cols = np.array(tp['uid'], dtype=np.int32), np.array(tp['sid'], dtype=np.int32)\n",
    "    count = tp['count']\n",
    "    return scipy.sparse.csr_matrix((count,(rows, cols)), dtype=np.int16, shape=shape), rows, cols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "train_data, rows, cols = load_data('in.train.num.csv')\n",
    "# binarize the data\n",
    "train_data.data = np.ones_like(train_data.data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(613682, 92543)\n",
      "(26139017,)\n"
     ]
    }
   ],
   "source": [
    "print train_data.shape\n",
    "print train_data.data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(613682, 92543)\n",
      "(2904335,)\n"
     ]
    }
   ],
   "source": [
    "vad_data, rows_vad, cols_vad = load_data('in.vad.num.csv')\n",
    "# binarize the data\n",
    "vad_data.data = np.ones_like(vad_data.data)\n",
    "print vad_data.shape\n",
    "print vad_data.data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "vad = dict(X_new=vad_data.data,\n",
    "           rows_new=rows_vad,\n",
    "           cols_new=cols_vad)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(613682, 92543)\n",
      "(7260837,)\n"
     ]
    }
   ],
   "source": [
    "test_data, rows_test, cols_test = load_data('in.test.num.csv')\n",
    "# binarize the data\n",
    "test_data.data = np.ones_like(test_data.data)\n",
    "print test_data.shape\n",
    "print test_data.data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "num_factors = 100\n",
    "num_iters = 10\n",
    "batch_size = 10000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "S = content_wmf.log_surplus_confidence_matrix(train_data, alpha=2.0, epsilon=1e-6)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "lambda_U_reg = 1e-1\n",
    "lambda_V_reg = 1e-1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "precompute S^T and X^TX (if necessary)\n",
      "  took 2.182 seconds\n",
      "run ALS algorithm\n",
      "Iteration 0:\n",
      "\tUpdating item factors: time=18.61\n",
      "\tUpdating user factors: time=47.06\n",
      "\tPred likeli: 0.82856\n",
      "Iteration 1:\n",
      "\tUpdating item factors: time=23.62\n",
      "\tUpdating user factors: time=58.10\n",
      "\tPred likeli: 0.59027\n",
      "Iteration 2:\n",
      "\tUpdating item factors: time=24.31\n",
      "\tUpdating user factors: time=60.97\n",
      "\tPred likeli: 0.54315\n",
      "Iteration 3:\n",
      "\tUpdating item factors: time=24.77\n",
      "\tUpdating user factors: time=61.77\n",
      "\tPred likeli: 0.52661\n",
      "Iteration 4:\n",
      "\tUpdating item factors: time=22.60\n",
      "\tUpdating user factors: time=61.54\n",
      "\tPred likeli: 0.51891\n",
      "Iteration 5:\n",
      "\tUpdating item factors: time=20.99\n",
      "\tUpdating user factors: time=62.41\n",
      "\tPred likeli: 0.51467\n",
      "Iteration 6:\n",
      "\tUpdating item factors: time=21.17\n",
      "\tUpdating user factors: time=59.48\n",
      "\tPred likeli: 0.51208\n",
      "Iteration 7:\n",
      "\tUpdating item factors: time=22.45\n",
      "\tUpdating user factors: time=59.69\n",
      "\tPred likeli: 0.51037\n",
      "Iteration 8:\n",
      "\tUpdating item factors: time=24.11\n",
      "\tUpdating user factors..."
     ]
    }
   ],
   "source": [
    "U, V, _ = content_wmf.factorize(S, num_factors, vad=vad, num_iters=num_iters, \n",
    "                                init_std=0.01, lambda_U_reg=lambda_U_reg, lambda_V_reg=lambda_V_reg, \n",
    "                                dtype='float32', random_state=98765, verbose=True, \n",
    "                                recompute_factors=batched_inv_joblib.recompute_factors_batched, \n",
    "                                batch_size=batch_size, n_jobs=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "np.savez('params_wmf_K%d_U%1.E_V%1.E.unpop.npz' % (num_factors, lambda_U_reg, lambda_V_reg), U=U, V=V)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
